This is a work in progress.


In [90]:
import Bio
from Bio import AlignIO
from Bio import Phylo
import inspect
import os
from itertools import imap, groupby, starmap

#specify dir where alignment files are located
aligned_dir = "Biopython Alignment"

In [4]:
#optional step
alignment = AlignIO.read(os.path.join(aligned_dir,'all_seq.aln'),'clustal')
print alignment


SingleLetterAlphabet() alignment with 8 rows and 13379 columns
---------------ATGGAGAGAATAAAAGAACTGAGAGATCT...TAC h2014
---------------ATGGAGAGAATAAAAGAACTGAGAGATCT...--- s2014
---------------ATGGAGAGAATAAAAGAACTGAGAGATCT...--- h2009
---------------ATGGAGAGAATAAAAGAACTAAGAGATCT...--- s2009
------------AATATGGAAAGAATAAAAGAACTACGAAATCT...--- h1935
--------------TATGGAAAGAATAAAAGAGCTAAGGAGTCT...--- h1978
TCAAATATATTCAATATGGAGAGAATAAAAGAACTAAGGGATCT...--- s1935
TCAAATATATTCAATATGGAGAGAATAAAGGAACTAAGAAATCT...--- s1978

The Tree


In [8]:
#http://biopython.org/wiki/Phylo_cookbook
#http://biopython.org/wiki/Phylo
tree = Phylo.read(os.path.join(aligned_dir,'all_seq.dnd'),"newick")
Phylo.draw_ascii(tree)
print tree


                                                          __________ h1935
                                             ____________|
                                        ____|            |______________ h1978
                                       |    |
                  _____________________|    |_______________ s1935
                 |                     |
  _______________|                     |____________________ s1978
 |               |
 |               |_____________ s2009
_|
 | h2009
 |
 |  _ h2014
 |_|
   |_ s2014

Tree(rooted=False, weight=1.0)
    Clade()
        Clade(branch_length=0.04081)
            Clade(branch_length=0.05117)
                Clade(branch_length=0.01348)
                    Clade(branch_length=0.03165)
                        Clade(branch_length=0.02643, name='h1935')
                        Clade(branch_length=0.03374, name='h1978')
                    Clade(branch_length=0.03667, name='s1935')
                Clade(branch_length=0.05144, name='s1978')
            Clade(branch_length=0.03363, name='s2009')
        Clade(branch_length=0.00479, name='h2009')
        Clade(branch_length=0.00677)
            Clade(branch_length=0.00426, name='h2014')
            Clade(branch_length=0.0042, name='s2014')

In [136]:
def lookup_by_names(tree):
    names = {}
    for clade in tree.find_clades():
        if clade.name:
            if clade.name in names:
                raise ValueError("Duplicate key: %s" % clade.name)
            names[clade.name] = clade
    return names

names = lookup_by_names(tree)
for phylum in ('h2009','s2009'):
    print "Phylum size", len(names[phylum].get_terminals())
    
def tabulate_names(tree):
    names = {}
    for idx, clade in enumerate(tree.find_clades()):
        if clade.name:
            clade.name = '%s_i%d' % (clade.name,idx)
        else:
            clade.name = str(idx)
        names[clade.name] = clade
    return names
tree2 =copy.deepcopy(tree)
tabulate_names(tree2)
Phylo.draw_ascii(tree2)


Phylum size 1
Phylum size 1
                                                       __________ h1935_i5
                                           ___________|
                                      ____|           |_____________ h1978_i6
                                     |    |
                  ___________________|    |_____________ s1935_i7
                 |                   |
  _______________|                   |___________________ s1978_i8
 |               |
 |               |____________ s2009_i9
_|
 | h2009_i10
 |
 |  _ h2014_i12
 |_|
   |_ s2014_i13


In [ ]:
'''
Calculate distances between neighboring terminals
Suggested by Joel Berendzen
'''
import itertools
 
def terminal_neighbor_dists(self):
    """Return a list of distances between adjacent terminals."""
    def generate_pairs(self):
        pairs = itertools.tee(self)
        pairs[1].next()
        return itertools.izip(pairs[0], pairs[1])
    return [self.distance(*i) for i in
            generate_pairs(self.find_clades(terminal=True))]

Compare genomes by Depths

depths method:

Create a mapping of tree clades to depths. The result is a dictionary where the keys are all of the Clade instances in the tree, and the values are the distance from the root to each clade (including terminals). By default the distance is the cumulative branch length leading to the clade, but with the unit_branch_lengths=True option, only the number of branches (levels in the tree) is counted. (http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc161)


In [9]:
tree_depths = tree.depths()
tree_depths


Out[9]:
{Clade(): 0,
 Clade(branch_length=0.04081): 0.04081,
 Clade(branch_length=0.05117): 0.09198,
 Clade(branch_length=0.01348): 0.10546000000000001,
 Clade(branch_length=0.03165): 0.13711,
 Clade(branch_length=0.02643, name='h1935'): 0.16354000000000002,
 Clade(branch_length=0.03374, name='h1978'): 0.17085,
 Clade(branch_length=0.03667, name='s1935'): 0.14213,
 Clade(branch_length=0.05144, name='s1978'): 0.14342,
 Clade(branch_length=0.03363, name='s2009'): 0.07444,
 Clade(branch_length=0.00479, name='h2009'): 0.00479,
 Clade(branch_length=0.00677): 0.00677,
 Clade(branch_length=0.00426, name='h2014'): 0.01103,
 Clade(branch_length=0.0042, name='s2014'): 0.01097}

In [65]:
name_depths_dict = {i.name:tree_depths[i] for i in tree_depths if i.name}
name_depths_dict


Out[65]:
{'h1935': 0.16354000000000002,
 'h1978': 0.17085,
 'h2009': 0.00479,
 'h2014': 0.01103,
 's1935': 0.14213,
 's1978': 0.14342,
 's2009': 0.07444,
 's2014': 0.01097}

In [103]:
#make list of two lists where the first list is the human stains and the second is the swine strains
def group_keys_by_host(keys):
    strains = sorted(keys)
    strains_by_host = [[i for i in group]for key, group in groupby(strains, lambda x: x[0])]
    return strains_by_host
strains_by_host = group_keys_by_host(name_depths_dict.keys())
strains_by_host


Out[103]:
[['h1935', 'h1978', 'h2009', 'h2014'], ['s1935', 's1978', 's2009', 's2014']]

In [77]:
#make list of tuples where the first item is the human stain for one year and the second is the swine strain for same year
strains_by_year = [i for i in zip(strains_by_host[0],strains_by_host[1]) if i[0][1:] == i[1][1:]]
strains_by_year


Out[77]:
[('h1935', 's1935'),
 ('h1978', 's1978'),
 ('h2009', 's2009'),
 ('h2014', 's2014')]

In [128]:
def compare(key_seq, vals_dict):
    ks = copy.deepcopy(key_seq)
    for i in key_seq:
        #print i,": "
        for ii in ks:
            #print '\t', ii,
        ks.remove(i)
        #print
    #print key_seq

#starmap(strains_by_host[0])
strains_by_host = group_keys_by_host(name_depths_dict.keys())
compare(strains_by_host[0], name_depths_dict)


h1935 : 
	h1935 	h1978 	h2009 	h2014
h1978 : 
	h1978 	h2009 	h2014
h2009 : 
	h2009 	h2014
h2014 : 
	h2014
['h1935', 'h1978', 'h2009', 'h2014']

In [146]:
def compare_genomes(genome_pair):
    gen1, gen2 = genome_pair[0],genome_pair[1]
    key = gen1+"-"+gen2
    return key, name_depths_dict[gen1] - name_depths_dict[gen2]

In [151]:
import itertools
strains_by_host = group_keys_by_host(name_depths_dict.keys())
year_diffs={}
for i in strains_by_host:
    matched = list(itertools.combinations(i,2))
    for ii in matched:
        key, val = compare_genomes(ii)
        year_diffs[key] = val
        print key,": ", year_diffs[key]
h, s = group_keys_by_host(year_diffs.keys())


h1935-h1978 :  -0.00731
h1935-h2009 :  0.15875
h1935-h2014 :  0.15251
h1978-h2009 :  0.16606
h1978-h2014 :  0.15982
h2009-h2014 :  -0.00624
s1935-s1978 :  -0.00129
s1935-s2009 :  0.06769
s1935-s2014 :  0.13116
s1978-s2009 :  0.06898
s1978-s2014 :  0.13245
s2009-s2014 :  0.06347

In [149]:
year_diffs = {}  
for i in strains_by_year:
    key, val = compare_genomes(i)
    year_diffs[key] = val
    print key,": ", year_diffs[key]
    
year_diffs


h1935-s1935 :  0.02141
h1978-s1978 :  0.02743
h2009-s2009 :  -0.06965
h2014-s2014 :  6e-05
Out[149]:
[['h1935-s1935', 'h1978-s1978', 'h2009-s2009', 'h2014-s2014']]

In [ ]: